"""
Central registry for text splitters.

This module provides a factory function to create different types of text splitters
based on configuration, similar to how embeddings_config.py works for embeddings.
"""

from typing import Optional, List, Any
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    TokenTextSplitter,
    SentenceTransformersTokenTextSplitter,
)
from langchain_core.embeddings import Embeddings
from loguru import logger

# Valid splitter type options
VALID_SPLITTER_TYPES = [
    "recursive",
    "token",
    "sentence",
    "semantic",
]


def get_text_splitter(
    splitter_type: str = "recursive",
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
    text_separators: Optional[List[str]] = None,
    embeddings: Optional[Embeddings] = None,
    **kwargs,
) -> Any:
    """
    Get text splitter based on type.

    Args:
        splitter_type: Type of splitter ('recursive', 'token', 'sentence', 'semantic')
        chunk_size: Maximum size of chunks
        chunk_overlap: Overlap between chunks
        text_separators: Custom separators (only used for 'recursive' type)
        embeddings: Embeddings instance (required for 'semantic' type)
        **kwargs: Additional splitter-specific parameters

    Returns:
        A text splitter instance

    Raises:
        ValueError: If splitter_type is invalid or required parameters are missing
        ImportError: If required dependencies are not installed
    """
    # Normalize splitter type
    splitter_type = splitter_type.strip().lower()

    # Validate splitter type
    if splitter_type not in VALID_SPLITTER_TYPES:
        logger.error(f"Invalid splitter type: {splitter_type}")
        raise ValueError(
            f"Invalid splitter type: {splitter_type}. "
            f"Must be one of: {VALID_SPLITTER_TYPES}"
        )

    logger.info(
        f"Creating text splitter: type={splitter_type}, "
        f"chunk_size={chunk_size}, chunk_overlap={chunk_overlap}"
    )

    # Create the appropriate splitter
    if splitter_type == "token":
        return TokenTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

    elif splitter_type == "sentence":
        return SentenceTransformersTokenTextSplitter(
            chunk_overlap=chunk_overlap,
            tokens_per_chunk=chunk_size,
        )

    elif splitter_type == "semantic":
        # Semantic chunking requires embeddings
        if embeddings is None:
            raise ValueError(
                "Semantic splitter requires 'embeddings' parameter. "
                "Please provide an embeddings instance."
            )

        try:
            # Try to import experimental semantic chunker
            from langchain_experimental.text_splitter import SemanticChunker

            # Get breakpoint threshold from kwargs or use default
            breakpoint_threshold_type = kwargs.get(
                "breakpoint_threshold_type", "percentile"
            )
            breakpoint_threshold_amount = kwargs.get(
                "breakpoint_threshold_amount", None
            )

            # Create semantic chunker
            chunker_kwargs = {"embeddings": embeddings}

            if breakpoint_threshold_type:
                chunker_kwargs["breakpoint_threshold_type"] = (
                    breakpoint_threshold_type
                )

            if breakpoint_threshold_amount is not None:
                chunker_kwargs["breakpoint_threshold_amount"] = (
                    breakpoint_threshold_amount
                )

            logger.info(
                f"Creating SemanticChunker with threshold_type={breakpoint_threshold_type}, "
                f"threshold_amount={breakpoint_threshold_amount}"
            )

            return SemanticChunker(**chunker_kwargs)

        except ImportError as e:
            logger.exception("Failed to import SemanticChunker")
            raise ImportError(
                "Semantic chunking requires langchain-experimental. "
                "Install it with: pip install langchain-experimental"
            ) from e

    else:  # "recursive" or default
        # Use custom separators if provided, otherwise use defaults
        if text_separators is None:
            text_separators = ["\n\n", "\n", ". ", " ", ""]

        return RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            separators=text_separators,
        )


def is_semantic_chunker_available() -> bool:
    """Check if semantic chunking is available."""
    import importlib.util

    return importlib.util.find_spec("langchain_experimental") is not None
